Parallel computing is enabled; please close all programs except R

# Register a parallel backend, leaving two cores free for the OS and other apps.
# max(1L, ...) guards against machines with <= 2 cores, where detectCores() - 2
# would request zero or a negative number of workers.
registerDoParallel(cores = max(1L, detectCores() - 2))

1. Data selection

1.1 Clean data

  • Uncomment these lines only if you downloaded the original data; otherwise use the reduced-size data
   #import original data
#craglistall = read.csv("Craglist.csv")
#craglistall[craglistall == ""] <- NA
#craglistall <- drop_na(craglistall)
#nrow(craglistall)

   #export complete case data to csv, this will be the reduced size file for submission
#write.csv(craglistall,"craglistall.csv", row.names = FALSE)

   #import complete case data
# Import the complete-case data exported above.
craglistcomplete = read.csv("craglistall.csv")
# Recode "other" cylinders to "0", strip the " cylinders" suffix, and coerce
# the numeric columns. across() replaces the superseded mutate_at().
craglistcomplete$cylinders[craglistcomplete$cylinders == "other"] <- "0"
craglistcomplete$cylinders <- as.numeric(gsub(" cylinders", "", craglistcomplete$cylinders))
craglistcomplete <- craglistcomplete %>% mutate(across(c(price, year, odometer), as.numeric))
# Drop implausible listings: prices outside (1000, 100000) are blanked, and
# odometer readings of 500k+ blank the price so the whole row is removed by
# drop_na() below.
craglistcomplete$price[craglistcomplete$price <= 1000] <- NA
craglistcomplete$price[craglistcomplete$price >= 100000] <- NA
craglistcomplete$price[craglistcomplete$odometer >= 500000] <- NA
# "other" transmission is treated as missing (only automatic/manual are kept).
craglistcomplete$transmission[craglistcomplete$transmission == "other"] <- NA
craglistcomplete <- drop_na(craglistcomplete)
names(craglistcomplete)
##  [1] "region"       "price"        "year"         "manufacturer" "model"       
##  [6] "condition"    "cylinders"    "fuel"         "odometer"     "title_status"
## [11] "transmission" "drive"        "size"         "type"         "paint_color" 
## [16] "description"  "state"        "lat"          "long"

1.2 Recode to levels

# Recode condition to an ordinal score: better condition = higher score,
# 6 levels; unrecognized values score 0.
condition_levels <- c("salvage", "fair", "good", "excellent", "like new", "new")
condition_score <- match(craglistcomplete$condition, condition_levels)
condition_score[is.na(condition_score) & !is.na(craglistcomplete$condition)] <- 0L
craglistcomplete$condition <- as.integer(condition_score)

# Recode size to an ordinal score: larger size = higher score, 4 levels;
# unrecognized values score 0.
size_levels <- c("sub-compact", "compact", "mid-size", "full-size")
size_score <- match(craglistcomplete$size, size_levels)
size_score[is.na(size_score) & !is.na(craglistcomplete$size)] <- 0L
craglistcomplete$size <- as.integer(size_score)

1.3 Grouping, get dummies

# Recode title_status into a clean-title indicator: clean == 1, anything else == 0.
craglistcomplete$title_status <- as.integer(craglistcomplete$title_status == "clean")

# Recode transmission: automatic == 1, manual == 0.
craglistcomplete$transmission <- as.integer(craglistcomplete$transmission == "automatic")

# One-hot encode "fuel", "type" and "drive", dropping the source columns.
craglist.grp <- dummy_cols(
  craglistcomplete,
  select_columns = c("fuel", "type", "drive"),
  remove_selected_columns = TRUE
)
#head(craglist.grp)

# There's a column name that will cause trouble in the random forest model.
# NOTE(review): renaming by hard-coded position (column 26) is fragile — if
# dummy_cols ever emits columns in a different order this silently renames the
# wrong column. Presumably column 26 is the minivan dummy (e.g. "type_mini-van");
# renaming by matching the original name would be safer. TODO confirm the
# source column name before changing this.
colnames(craglist.grp)[26] <- "type_minivan"

2. Visualization

2.1 Price ~ type box plot

# Box plot of listed price by vehicle type.
plot21 <- ggplot(craglistcomplete, aes(x = type, y = price)) +
  geom_boxplot(fill = "#FFDB6D", color = "#C4961A")
plot21

  • By comparing the box plots, trucks, pickups and off-road vehicles have the highest prices, while sedans, wagons and minivans have the lowest. We can see that car size has an influence on price: the larger the car, the more expensive it tends to be.

2.2 Price ~ fuel type plot

# Average listed price by fuel type, shown as a bar chart.
Fuel_Price <- craglistcomplete %>%
  group_by(fuel) %>%
  summarise(avgprice = mean(price))
ggplot(Fuel_Price, aes(x = fuel, y = avgprice)) +
  geom_bar(stat = "identity", fill = "#F59811") +
  theme_light() +
  labs(x = "fuel type") +
  ggtitle("Avg price by fuel type") +
  theme(plot.title = element_text(hjust = 0.5))

2.3 Geographical comparison

2.3.1 State average price comparison
# Keep only the columns needed for the state-level comparison.
group261 <- subset(craglistcomplete, select = c(state, price))
# Average price per state.
state_plot <- group261 %>%
  group_by(state) %>%
  summarise(avgprice = mean(price))
state_plot
## # A tibble: 51 × 2
##    state avgprice
##    <chr>    <dbl>
##  1 ak      24296.
##  2 al      16438.
##  3 ar      15468.
##  4 az      14111.
##  5 ca      13998.
##  6 co      12261.
##  7 ct      11047.
##  8 dc      10300.
##  9 de      14253.
## 10 fl      14645.
## # … with 41 more rows
# Choropleth of average price by state (white = low, red = high).
plot_usmap(data = state_plot, values = "avgprice", color = "coral") +
  scale_fill_continuous(low = "white", high = "red",
                        name = "Average Price", label = scales::comma) +
  theme(legend.position = "right")

2.3.2 City comparison
# Google Maps API access key.
# SECURITY: never commit a real API key to source control — read it from the
# environment instead (set GOOGLE_MAPS_API_KEY in ~/.Renviron). The key that
# was previously hard-coded here should be revoked/rotated.
register_google(key = Sys.getenv("GOOGLE_MAPS_API_KEY"))
# Get Pittsburgh map
Pitt <- get_map(location = "Pittsburgh", zoom = 10, source = "google", maptype = "roadmap")
# Get Columbus map
Cbus <- get_map(location = "Columbus", zoom = 10, source = "google", maptype = "roadmap")
# Scatter the listings' coordinates over each city, colored by drive train.
PittMap <- ggmap(Pitt) + geom_point(aes(long, lat, color = drive), data = craglistcomplete) + ggtitle("Pittsburgh Drive Train") + xlab("Longitude") + ylab("Latitude") + theme(plot.title = element_text(hjust = 0.5))

CbusMap <- ggmap(Cbus) + geom_point(aes(long, lat, color = drive), data = craglistcomplete) + ggtitle("Columbus Drive Train") + xlab("Longitude") + ylab("Latitude") + theme(plot.title = element_text(hjust = 0.5))

PittMap; CbusMap

2.4 Average Price comparison

2.4.1 Price ~ Year and type
# Average all numeric columns by type and year for 2000-2020.
# across(where(...)) replaces the superseded summarise_if().
# NOTE: this reuses/overwrites the Fuel_Price object from section 2.2.
Fuel_Price <- craglistcomplete %>%
  group_by(type, year) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  filter(year <= 2020 & year >= 2000)

# Stacked-area view of average price over time, one band per vehicle type.
plot231 <- ggplot(data = Fuel_Price, aes(x = year, y = price, group = type, fill = type)) +
  geom_area(alpha = 0.4, size = 0.5, colour = "black")
plot231

2.4.2 Price ~ Year and odometer
# Per type/year/region: listing count, average price and average odometer,
# restricted to model years 1950 onward.
sdf <- craglistcomplete %>%
  group_by(type, year, region) %>%
  summarise(count_n = n(),
            avgprice = mean(price),
            avgodometer = mean(odometer)) %>%
  filter(year >= 1950)
sdf
## # A tibble: 29,136 × 6
## # Groups:   type, year [751]
##    type   year region                   count_n avgprice avgodometer
##    <chr> <dbl> <chr>                      <int>    <dbl>       <dbl>
##  1 bus    1968 southeast missouri             1     5500       12345
##  2 bus    1970 sarasota-bradenton             1    32500       86653
##  3 bus    1975 dallas / fort worth            1    29999      100000
##  4 bus    1976 redding                        1     5500       55207
##  5 bus    1978 kalispell                      1     9500           0
##  6 bus    1978 kennewick-pasco-richland       1    25000       78000
##  7 bus    1978 oregon coast                   1    25000       78000
##  8 bus    1979 phoenix                        1    23800        1234
##  9 bus    1982 houston                        1    11000       82000
## 10 bus    1984 fort collins / north CO        1     7950      136543
## # … with 29,126 more rows
# Bubble chart: average price vs average odometer (log x scale), sized by the
# number of listings. `frame = year` is not a ggplot2 aesthetic; it is picked
# up by plotly to animate one frame per year (ggplot itself warns about the
# unknown aesthetic — expected here).
# NOTE(review): sdf was already filtered to year >= 1950 above, so the filter
# in this call is redundant but harmless.
plot232 <- ggplot(sdf %>% filter(year>=1950), aes(x = avgodometer, y = avgprice, color = type)) + geom_point(aes(size = count_n,frame = year)) + scale_x_log10() + ggtitle("Listed car price ~ Year and odometer") + theme(plot.title = element_text(hjust = 0.5))
# Using frame = year, we specify the datapoints for each frame
ggplotly(plot232)
2.4.3 Price ~ Make
# Keep only manufacturer and price.
group233 <- subset(craglistcomplete, select = c(manufacturer, price))
# Average price per manufacturer.
mfg_plot <- group233 %>%
  group_by(manufacturer) %>%
  summarise(avgprice = mean(price))
mfg_plot
## # A tibble: 41 × 2
##    manufacturer avgprice
##    <chr>           <dbl>
##  1 acura           9901.
##  2 alfa-romeo     21796.
##  3 aston-martin   53367 
##  4 audi           14416.
##  5 bmw            13478.
##  6 buick           9467.
##  7 cadillac       12655.
##  8 chevrolet      15866.
##  9 chrysler        8129.
## 10 datsun         13435.
## # … with 31 more rows
# Horizontal bar chart of average price per manufacturer.
plot233 <- ggplot(mfg_plot, aes(x = manufacturer, y = avgprice)) +
  geom_bar(stat = "identity", fill = "#79A5FF", show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("Manufacturer's car Avg Price") +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()
plot233

2.4.4 Price ~ Drive Train
# Subset the columns needed for the drive-train comparison.
group234 <- subset(craglistcomplete, select = c(drive, price, region, transmission))
# Average price by drive train / region / transmission for six nearby cities.
drive_plot <- group234 %>%
  group_by(drive, region, transmission) %>%
  summarise(avgprice = mean(price)) %>%
  filter(region %in% c("columbus", "pittsburgh", "cleveland",
                       "philadelphia", "cincinnati", "harrisburg"))

# Stacked bars per city, one panel per drive train (transmission: 1 = automatic).
plot234 <- ggplot(drive_plot, aes(x = region, y = avgprice, fill = transmission)) +
  geom_bar(stat = "identity") +
  facet_wrap(~drive) +
  theme_light() +
  labs(x = "Drive train across cities, 1 is automatic transmission") +
  ggtitle("Average price by drive train") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 90))
plot234

3. Price prediction

3.1 Subset for prediction

# Drop identifier and free-text columns that should not enter the models.
craglist.sub <- subset(
  craglist.grp,
  select = -c(region, manufacturer, model, paint_color, description, state)
)

3.2 Split data into train and test

# Reproducible 80/20 train/test split. (`train_rows` avoids shadowing
# base::split with a variable named `split`.)
set.seed(114514, sample.kind = "Rejection")
train_rows <- sample(nrow(craglist.sub), 0.8 * nrow(craglist.sub))
train <- craglist.sub[train_rows, ]
test <- craglist.sub[-train_rows, ]

3.3 Linear model and performance

3.1.1 Correlation check
# Pairwise correlations of the training predictors, shown as a color matrix.
train_cor <- cor(train)
corrplot(train_cor, method = "color", tl.cex = 0.7)

3.1.2 Linear Model
  • Diesel is correlated with gas, and fwd and sedan are correlated with many other variables; we remove them from the linear model to comply with its assumptions.
#Linear model
# Price regressed on all remaining predictors; fuel_diesel, drive_fwd and
# type_sedan are excluded because of the collinearity observed in 3.1.1.
# NOTE(review): the name `lm` shadows stats::lm() for the rest of the script.
# It still works (R resolves the call form separately), but a name such as
# `lm_fit` would be clearer. Left unchanged because later chunks reference `lm`.
lm <- lm(price ~ .- fuel_diesel - drive_fwd - type_sedan , data = train)
summary(lm)
## 
## Call:
## lm(formula = price ~ . - fuel_diesel - drive_fwd - type_sedan, 
##     data = train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -34651  -4227   -745   3137  97538 
## 
## Coefficients:
##                    Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)      -5.473e+05  7.588e+03  -72.119  < 2e-16 ***
## year              2.777e+02  3.772e+00   73.621  < 2e-16 ***
## condition         1.895e+03  4.563e+01   41.537  < 2e-16 ***
## cylinders         1.276e+03  2.755e+01   46.329  < 2e-16 ***
## odometer         -8.687e-02  5.664e-04 -153.381  < 2e-16 ***
## title_status      2.036e+03  1.325e+02   15.364  < 2e-16 ***
## transmission     -1.740e+03  1.298e+02  -13.406  < 2e-16 ***
## size              7.027e+02  5.198e+01   13.518  < 2e-16 ***
## lat              -2.249e+01  5.814e+00   -3.868  0.00011 ***
## long             -4.021e+01  1.799e+00  -22.355  < 2e-16 ***
## fuel_electric    -1.202e+03  8.471e+02   -1.419  0.15592    
## fuel_gas         -1.192e+04  1.367e+02  -87.215  < 2e-16 ***
## fuel_hybrid      -9.647e+03  3.217e+02  -29.993  < 2e-16 ***
## fuel_other       -1.263e+04  8.551e+02  -14.773  < 2e-16 ***
## type_bus          3.769e+03  8.215e+02    4.588 4.49e-06 ***
## type_convertible  4.264e+03  2.098e+02   20.318  < 2e-16 ***
## type_coupe        4.708e+03  1.608e+02   29.279  < 2e-16 ***
## type_hatchback    8.058e+01  1.751e+02    0.460  0.64533    
## type_minivan      1.694e+03  2.162e+02    7.837 4.69e-15 ***
## type_offroad      4.810e+03  4.867e+02    9.883  < 2e-16 ***
## type_other        1.999e+03  4.126e+02    4.846 1.26e-06 ***
## type_pickup       4.146e+03  1.457e+02   28.444  < 2e-16 ***
## type_SUV          1.313e+03  1.021e+02   12.855  < 2e-16 ***
## type_truck        6.560e+03  1.278e+02   51.332  < 2e-16 ***
## type_van          4.064e+03  1.877e+02   21.649  < 2e-16 ***
## type_wagon        2.807e+02  2.384e+02    1.177  0.23906    
## drive_4wd         4.471e+03  9.984e+01   44.778  < 2e-16 ***
## drive_rwd         1.901e+03  1.078e+02   17.633  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7622 on 58034 degrees of freedom
## Multiple R-squared:  0.5974, Adjusted R-squared:  0.5972 
## F-statistic:  3189 on 27 and 58034 DF,  p-value: < 2.2e-16
3.1.3 Linear model performance in test data
# Out-of-sample performance of the linear model: OSR^2 against a
# train-mean baseline.
test$predlm <- predict(lm, newdata = test)
train.mean <- mean(train$price)
residuals_lm <- test$price - test$predlm
SSElm <- sum(residuals_lm^2)
SSTlm <- sum((test$price - train.mean)^2)
MAElm <- mean(abs(residuals_lm))
OSRlm <- 1 - SSElm / SSTlm
OSRlm
## [1] 0.6002058
MAElm
## [1] 5270.548

3.4 Regression tree

3.4.1 Base tree
# Baseline regression tree with a conservative complexity penalty.
set.seed(114514, sample.kind = "Rejection")
basetree <- rpart(price ~ ., data = train, method = "anova",
                  minbucket = 50, cp = 0.05)
# Cross-validated error by complexity parameter.
plotcp(basetree)

rpart.plot(basetree, digits = -5, fallen.leaves = TRUE)

3.4.2 Adjust the tree
# Adjust the cp and minbucket: grow a deeper tree by loosening both.
adjtree = rpart(price ~ ., data=train, method="anova",minbucket=5,cp=0.001)
# BUG FIX: the original plotted `basetree` again here; plot the adjusted tree.
rpart.plot(adjtree, digits=-5, fallen.leaves = T)

3.4.3 Tree performance in test data
# Out-of-sample performance of the adjusted regression tree.
test$predrt <- predict(adjtree, newdata = test)
residuals_rt <- test$price - test$predrt
SSErt <- sum(residuals_rt^2)
SSTrt <- sum((test$price - train.mean)^2)
MAErt <- mean(abs(residuals_rt))
OSRrt <- 1 - SSErt / SSTrt
OSRrt
## [1] 0.734258
MAErt
## [1] 4123.442

3.5 Random forest

3.5.1 Random forest 300 trees
# Baseline random forest: 300 trees, minimum node size 20, 4 candidate
# variables tried per split.
set.seed(114514, sample.kind = "Rejection")
baserf <- randomForest(price ~ ., data = train, ntree = 300,
                       nodesize = 20, mtry = 4)
3.5.2 Plot and tune for mtry
# Tune mtry around the starting value, doubling each step while OOB error
# improves by at least 1%.
x <- train[, -1]
y <- train$price
set.seed(114514, sample.kind = "Rejection")
tuneRF(x, y, mtryStart = 4, stepFactor = 2, ntreeTry = 300,
       nodesize = 20, improve = 0.01)
## mtry = 4  OOB error = 28614929 
## Searching left ...
## mtry = 2     OOB error = 49289891 
## -0.7225236 0.01 
## Searching right ...
## mtry = 8     OOB error = 21468184 
## 0.2497558 0.01 
## mtry = 16    OOB error = 20520460 
## 0.04414552 0.01 
## mtry = 30    OOB error = 21022298 
## -0.02445548 0.01

##    mtry OOBError
## 2     2 49289891
## 4     4 28614929
## 8     8 21468184
## 16   16 20520460
## 30   30 21022298
3.5.3 Tuned Random Forest
#Tuned rf
# Refit with the tuned mtry = 16. A seed is set here for reproducibility,
# consistent with every other model fit in this script (the original fit
# was unseeded, so its results varied run to run).
set.seed(114514, sample.kind = "Rejection")
rf.final = randomForest(price~., data = train, ntree=300, nodesize=20, mtry=16)
# Variable importance of the tuned forest.
varImpPlot(rf.final)

3.5.4 Forest performance in test data
# Out-of-sample performance of the tuned forest.
test$predrf <- predict(rf.final, newdata = test)
residuals_rf <- test$price - test$predrf
SSErf <- sum(residuals_rf^2)
SSTrf <- sum((test$price - train.mean)^2)
MAErf <- mean(abs(residuals_rf))
OSRrf <- 1 - SSErf / SSTrf
OSRrf
## [1] 0.8630558
MAErf
## [1] 2576.654
3.5.5 Forest Interpretation
  • Partial dependence plot
# Partial dependence of predicted price on four predictors, in a 2x2 grid.
# Save and restore the graphics state so the 2x2 layout does not leak into
# every later base-graphics plot.
op <- par(mfrow = c(2, 2))
partialPlot(rf.final, test, condition)
partialPlot(rf.final, test, cylinders)
partialPlot(rf.final, test, year)
partialPlot(rf.final, test, odometer)
par(op)

3.6 XGBoost

  • What is XGBoost? An implementation of gradient boosted decision trees designed for speed and performance.
  • Why we are using it? To boost model performance and check if there’s a better fitment or prediction performance.
3.6.1 XGB model
# k-fold cross-validation (caret's default k) repeated 5 times.
set.seed(114514, sample.kind = "Rejection")
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
# Fit a gradient-boosted tree ensemble via caret.
xgb_ctrl <- train(price ~ ., data = train, method = "xgbTree",
                  verbosity = 0, trControl = ctrl)
# Visualize the first tree of the final boosted model.
xgb.plot.tree(model = xgb_ctrl$finalModel, trees = 1)
3.6.2 XGB performance in test data
# Out-of-sample performance of the boosted model.
test$predxgb <- predict(xgb_ctrl, newdata = test)
residuals_xgb <- test$price - test$predxgb
SSExgb <- sum(residuals_xgb^2)
SSTxgb <- sum((test$price - train.mean)^2)
MAExgb <- mean(abs(residuals_xgb))
OSRxgb <- 1 - SSExgb / SSTxgb
OSRxgb
## [1] 0.8305857
MAExgb
## [1] 3169.467

4. Model comparison and selection

# Collect MAE and out-of-sample R^2 for all four models into one matrix.
# Renamed from `table` to avoid shadowing base::table().
perf_table <- matrix(c(MAElm, OSRlm, MAErt, OSRrt, MAErf, OSRrf, MAExgb, OSRxgb),
                     ncol = 2, byrow = TRUE)
colnames(perf_table) <- c("MAE", "R-Square")
rownames(perf_table) <- c("Linear Model", "Regression Tree", "Random Forest", "XGBoost")
round(perf_table, digits = 2)
##                     MAE R-Square
## Linear Model    5270.55     0.60
## Regression Tree 4123.44     0.73
## Random Forest   2576.65     0.86
## XGBoost         3169.47     0.83

5. Text mining for description

5.1 Subset data for text mining

# Strip tabs/newlines, punctuation and digits from the free-text descriptions.
craglist.text1 <- craglistcomplete$description
craglist.text1 <- gsub('[\t\n]','',craglist.text1)
craglist.text1 <- gsub('[^[:alnum:] ]','',craglist.text1)
craglist.text1 <- gsub('[[:digit:]]', '', craglist.text1)
# Build the line/text frame directly, replacing the original's auto-generated
# column name ("as.character.craglist.text1.") plus rename dance.
Car_description <- data.frame(line = seq_len(nrow(craglistcomplete)),
                              text = as.character(craglist.text1))
# Tokenize: one row per word.
Car_description <- Car_description %>% unnest_tokens(word, text)
5.1.1 Remove Stop Words
# Drop common English stop words from the token list (tidytext's stop_words).
Car_description <- Car_description %>% anti_join(stop_words, by = c(word = "word"))

5.2 Wordcloud

# Word frequencies for the cloud, most common first.
word_counts <- Car_description %>% count(word, sort = TRUE)
#word_counts
wordcloud2(word_counts, color = "random-light", backgroundColor = "white")

5.3 Sentiment analysis

# Sentiment comparison cloud: positive vs negative words (bing lexicon).
Car_description %>%
  inner_join(get_sentiments("bing"), by = "word") %>% # explicit key silences the "Joining, by =" message
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>% # cast long counts into a word x sentiment matrix
  comparison.cloud(colors = c("#F8766D", "#00BFC4"),
                   max.words = 150)